setwd("~/Desktop/Research/R/support_test")
library(data.table)
library(tidyr)
library(xts)
source("hawkes_utilities.R")
save_bigdt_only = TRUE



# event size and price rounding must be ticker-dependent
event_size_by_ticker <- data.table(
  ticker = c("AAPL", "APRN", "LYFT", "TSG", "UBER"),
  size = c(100, 200, 100, 200, 100),
  price = c(1e4, 0.1e4, 0.5e4, 0.2e4, 1e4)
)
# all_files_sub <- all_files[grep("LYFT", all_files)]
# size_distr <- sapply(all_files_sub, function(data_file_name){
#   book_msg <- fread(paste0(data_dir, data_file_name, "_34200000_57600000_message_10.csv"))
#   colnames(book_msg) <- c("time", "event_type", "order_id", "size", "price", "direction", "note")
#   c(book_msg[, summary(size)], book_msg[,summary(price)])
# })
# apply(size_distr, 1, summary)


# load data ----------------------------------------------------------------------------------------------------------------
data_dir <- "~/Desktop/Research/lobster/"
save_dir <- "~/Disk/Research/image_support_test/"
if (grepl("/n/", getwd())){
  save_dir <- "/n/scratchlfs/kou_lab/shihaoyang/lob/"
  data_dir <- "/n/kou_lab/shihaoyang/lob/"
}

args <- commandArgs(trailingOnly=TRUE)
if(length(args) == 0){
  # AAPL_2019-01-09 only has 10 minutes of data, linear model gives NA on many places, code breaks
#   data_file_name <- "AAPL_2019-01-02"
    data_file_name <- "LYFT_2019-05-10"
}else{
  all_files <- list.files(data_dir)
  all_files <- all_files[grep("_34200000_57600000_", all_files)]
  all_files <- strsplit(all_files, "_34200000_57600000_")
  all_files <- sort(unique(sapply(all_files, function(x) x[1])))
  data_file_name <- all_files[as.numeric(args[1])]
}
print(data_file_name)
data_ticker <- strsplit(data_file_name, "_")[[1]][[1]]

book_msg <- fread(paste0(data_dir, data_file_name, "_34200000_57600000_message_10.csv"))
book_order <- fread(paste0(data_dir, data_file_name, "_34200000_57600000_orderbook_10.csv"))

level_no <- ncol(book_order) / 4

colnames(book_msg) <- c("time", "event_type", "order_id", "size", "price", "direction", "note")
colnames(book_order) <- paste0(c("ask_price_", "ask_size_", "bid_price_", "bid_size_"), ceiling(1:ncol(book_order) / 4))

book <- copy(book_order)
sapply(book, class)

# fix integer 64 9999999999 values
for(col in names(book)) set(book, i=which(book[[col]]==9999999999), j=col, value=10000000000)
for(col in names(book)) set(book, i=which(book[[col]]==-9999999999), j=col, value=-10000000000)

#
book[, mid_price := (ask_price_1 + bid_price_1) / 2]

book[, ref_price := mid_price]
if (book$ref_price[1] %% 100 == 0){
  book$ref_price[1] <- book$ref_price[1] + 50
}
book[, ref_price_change := ref_price - c(NA, ref_price[-nrow(book)])]

while(nrow(book[ref_price %% 100 == 0 & ref_price_change > 0]) > 0){
  book[ref_price %% 100 == 0 & ref_price_change > 0, ref_price := ref_price - 50]
  book[, ref_price_change := ref_price - c(NA, ref_price[-nrow(book)])]
  print(nrow(book[ref_price %% 100 == 0 & ref_price_change > 0]))
}

while(nrow(book[ref_price %% 100 == 0 & ref_price_change < 0]) > 0){
  book[ref_price %% 100 == 0 & ref_price_change < 0, ref_price := ref_price + 50]
  book[, ref_price_change := ref_price - c(NA, ref_price[-nrow(book)])]
  print(nrow(book[ref_price %% 100 == 0 & ref_price_change < 0]))
}

nrecord <- nrow(book)
stopifnot(nrecord == nrow(book_order))
stopifnot(nrecord == nrow(book_msg))
book[, stopifnot(all(abs(ref_price_change[-1] - diff(ref_price)) < 1e-4))]

book[, stopifnot(all(ref_price %% 100 == 50))]
book[, stopifnot(all(ref_price > bid_price_1))]
book[, stopifnot(all(ref_price < ask_price_1))]

# For each level, assign "q+/-" to ask/bid sizes.
for(k in 1:level_no){
  book[, paste0("q_plus_", k) := integer(nrow(book))]
  for(i in 1:k){
    book[get(paste0("ask_price_", i)) == ref_price + 50 + (k-1) * 100, 
         paste0("q_plus_", k) := get(paste0("ask_size_", i))]
  }
  for(i in setdiff(1:level_no, 1:k)){
    book[, stopifnot(all(get(paste0("ask_price_", i)) > ref_price + 50 + (k-1) * 100))]
  }
  
  book[, paste0("q_minus_", k) := integer(nrow(book))]
  for(i in 1:k){
    book[get(paste0("bid_price_", i)) == ref_price - 50 - (k-1) * 100, 
         paste0("q_minus_", k) := get(paste0("bid_size_", i))]
  }
  for(i in setdiff(1:level_no, 1:k)){
    book[, stopifnot(all(get(paste0("bid_price_", i)) < ref_price - 50 - (k-1) * 100))]
  }
}

summary(book$q_minus_1)
summary(book$q_plus_1)
barplot(colMeans(book[, mget(c(paste0("q_minus_", 10:1), paste0("q_plus_", 1:10)))]))

id <- book_msg[, which(event_type %in% c(5, 6, 7))]
# confirm event_type 5, 6, 7 does not change orderbook
id <- setdiff(id, 1)
# data.matrix will return NA. The program works by deleting data.matrix()
#stopifnot(all(data.matrix(book[id, mget(colnames(book_order))]) - 
#                data.matrix(book[id-1, mget(colnames(book_order))]) == 0))

stopifnot(all(book[id, mget(colnames(book_order))] - 
                book[id-1, mget(colnames(book_order))] == 0))
# confirmed that reference price construction is still correct if delete here 
# book in original order is orderbook shape right after the event
book <- book[-id]
book_event <- book_msg[-id]
# now book has orderbook shape right before the event. Original matching from Lobster data is after the event.
book <- cbind(book[-nrow(book)], book_event[-1])
nrecord <- nrow(book)
book[, stopifnot(all(abs(ref_price_change[-1] - diff(ref_price)) < 1e-4))]

stopifnot(book[,all(price %% 100 == 0)])
book[, modifying_queue := (book$price - book$ref_price) / 100]
book[, modifying_queue := sign(modifying_queue) * (abs(modifying_queue) + 0.5)]
book[, table(modifying_queue)]
book[!modifying_queue %in% -10:10, .N, by = modifying_queue]
# book <- book[modifying_queue %in% -10:10]
book[, stopifnot(all(abs(ref_price_change[-1] - diff(ref_price)) < 1e-4))]
stopifnot(nrecord == nrow(book))

book <- book[event_type %in% c(1,2,3,4,5)]
book[event_type == 1, event_type_coarsen := "+"]
book[event_type %in% c(2,3), event_type_coarsen := "-"]
book[event_type %in% c(4,5), event_type_coarsen := "t"]

book[, new_ref_price_after_event := c(ref_price[-1], NA)]
book[, ref_price_jump_after_event := new_ref_price_after_event - ref_price]

average_event_size <- book[,mean(size),by=modifying_queue]
average_event_size <- average_event_size[order(modifying_queue)]
predetermined_size_unit <- event_size_by_ticker[ticker == data_ticker, size]
if(length(predetermined_size_unit) != 1){
  predetermined_size_unit <- 100
}
average_event_size$V1 <- predetermined_size_unit

for(k in 1:level_no){
  book[, paste0("q_std_plus_", k) := get(paste0("q_plus_", k)) / average_event_size[modifying_queue == k, V1]]
  # Take the ceiling for each element in "q_std_plus_k" to make them integers.
  book[, paste0("q_std_plus_", k) := ceiling(get(paste0("q_std_plus_", k)))]
  book[, paste0("q_std_minus_", k) := get(paste0("q_minus_", k)) / average_event_size[modifying_queue == -k, V1]]
  book[, paste0("q_std_minus_", k) := ceiling(get(paste0("q_std_minus_", k)))]
}

book[, queue_and_event := paste0(sprintf("%+d", modifying_queue), "(", event_type_coarsen, ")")]

book[, bid_ask_spread := (ask_price_1 - bid_price_1)]
book[, bid_ask_spread := pmin(bid_ask_spread, 900) / 100]

book[, q_std_best_ask := ceiling(ask_size_1 / average_event_size[modifying_queue==1,V1])]
book[, q_std_best_bid := ceiling(bid_size_1 / average_event_size[modifying_queue==-1,V1])]

book[, ref_price_event := ""]
book[ref_price_jump_after_event > 0, ref_price_event := paste0("p+(", event_type_coarsen, ")")]
book[ref_price_jump_after_event < 0, ref_price_event := paste0("p-(", event_type_coarsen, ")")]
book[ref_price_jump_after_event != 0, table(ref_price_event)]
book[ref_price_jump_after_event != 0, table(ref_price_event, modifying_queue)]
# the common jump because of -, +, t, can assume to be modifying the least expensive queue
# the common jump due to + is impossible if the bid-ask spread is tight (1 tick)
# same as cancellation/trade is impossible if the queue has zero size
book[ref_price_jump_after_event == "p-(+)",stopifnot(all(direction == 1))]
book[ref_price_jump_after_event == "p+(+)",stopifnot(all(direction == -1))]

book[ref_price_jump_after_event != 0, queue_and_event := ref_price_event]
book[, sum(size), by=event_type_coarsen]

# Add time factor to book: 13/78/390 means 30/5/1-minutes data per group --------------------------------
# book[,time_30min := cut(book$time, 13, labels = c(1:13))]
# book[,time_5min := cut(book$time, 78, labels = c(1:78))]
#book[,time_1min := cut(book$time, 390, labels = c(1:390))]
# breaks_1 = c( 34200 + seq(0,3600,300),  37800 + seq(15*60, 16200, 15*60),   54000 + seq(300, 3600,300) )
# book[, time_customizedone := cut(book$time, breaks = breaks_1, labels =c(1:42))]

#breaks_2 = c( 34200 + seq(0,1800,60),  36000 + seq(300, 19800, 300),   55800 + seq(60, 1800,60) )
#book[, time_customizedtwo := cut(book$time, breaks = breaks_2, labels =c(1:126))]


# Hawkes-Markov model for orderbook process ------------------------------------------------
event <- dcast(book[modifying_queue %in% -3:3], 
               time ~ queue_and_event, value.var = "event_type_coarsen",
               fun.aggregate = length)
cols_all <- paste0(rep(sprintf("%+d", c(1:3, -(1:3))), each=3), "(", c("+", "-", "t"), ")")
cols_all <- c(cols_all, c("p+(+)", "p+(-)", "p+(t)", "p-(+)", "p-(-)", "p-(t)"))
for(i in setdiff(cols_all, colnames(event))){
  event[, eval(i) := 0]
}
event <- event[,mget(c("time", cols_all))]


stopifnot(all(colnames(event) != ""))
# 1 indicate rows, 2 indicate columns
stopifnot(all(apply(event[,-1] > 0, 1, any)))

state_func<- function(t){
  timestap <- data.table(time=t)
  book_state <- merge(book, timestap, by="time", all=TRUE)
  book_state <- book_state[order(time)]
  # Fill missing values
  for (i in colnames(book_state)[is.na(book_state[1,])]){
    book_state[[i]][1] <- book[[i]][1]
  }
  # Original direction is the default "down". Since book includes LOB state before event arrival,
  # direction should be "updown".
  for (i in colnames(book_state)){
    book_state <- book_state %>% fill(i, .direction = "updown")  
  }
  # Merge to match standard bin size
  book_state <- unique(book_state, by="time")
  book_state <- merge(book_state, timestap, by="time", all=FALSE)
  # mget function only applies to data_table type  book_state = data.table(book_state)
  book_state = data.table(book_state)
  breaks_2 = c( 34200 + seq(0,1800,60),  36000 + seq(300, 19800, 300),   55800 + seq(60, 1800,60) )
  book_state[, time_customizedtwo := cut(book_state$time, breaks = breaks_2, labels =c(1:126),include.lowest=TRUE,right = FALSE)]
  
  char_cols <- c("time", "note","ref_price_event", "queue_and_event", "event_type_coarsen")
  state <- data.matrix(book_state[,mget(setdiff(colnames(book_state), char_cols))])
  
  event_name <- paste(colnames(event)[-1], collapse = ".")
  paste0(rep(sprintf("%+d", c(1:3, -(1:3))), each=3), "(", c("+", "-", "t"), ")")
  e1 <- "+1(+).+1(-).+1(t).+2(+).+2(-).+2(t).+3(+).+3(-).+3(t).-1(+).-1(-).-1(t).-2(+).-2(-).-2(t).-3(+).-3(-).-3(t)"
  e2 <- "-3.-2.-1.1.2.3"
  e3 <- paste(e1, "p+(+).p+(-).p+(t).p-(+).p-(-).p-(t)", sep=".")
  stopifnot(event_name == e3)
  
  # time_30min <- state[, "time_30min"]
  # time_5min <- state[, "time_5min"]
#  time_1min <- state[, "time_1min"]
#  time_customizedone <- state[, "time_customizedone"]
  time_customizedtwo <- state[, "time_customizedtwo"]
#  time_customizedthree <- state[, "time_customizedthree"]
  # price <- state[,"ref_price"]
  # predetermined_price_unit <- event_size_by_ticker[ticker == data_ticker, price]
  # if(length(predetermined_price_unit) != 1){
  #   price_unit <- predetermined_price_unit
  # }else{
  #   price_unit <- 10000
  # }
  # price <- floor(price / price_unit)
  state <- state[,c("q_std_plus_1", "q_std_plus_1", "q_std_plus_1",
                    "q_std_plus_2", "q_std_plus_2", "q_std_plus_2",
                    "q_std_plus_3", "q_std_plus_3", "q_std_plus_3",
                    "q_std_minus_1", "q_std_minus_1", "q_std_minus_1",
                    "q_std_minus_2", "q_std_minus_2", "q_std_minus_2",
                    "q_std_minus_3", "q_std_minus_3", "q_std_minus_3",
                    "bid_ask_spread", # for p+(+)
                    "q_std_best_ask", # for p+(-)
                    "q_std_best_ask", # for p+(t)
                    "bid_ask_spread", # for p-(+)
                    "q_std_best_bid", # for p-(-)
                    "q_std_best_bid" # for p-(t)
  )]
  state[state > 9] <- 9
  state_list = list()
  state_list$state <- lapply(1:(ncol(event)-1), function(i){
    cbind(state=state[,i])
  })
  # state_list$price = price
  # state_list$time_30min = time_30min
  # state_list$time_5min = time_5min
#  state_list$time_1min = time_1min
#  state_list$time_customizedone = time_customizedone
  state_list$time_customizedtwo = time_customizedtwo
#  state_list$time_customizedthree = time_customizedthree
  return(state_list)
}
rm(book_event, book_msg, book_order)

# Set Parameters ---------------------------------------------------------------------------
delta = 0.01
cores = 12
save_dir_1 <- paste0(save_dir, "data_processed_nosize/")
support_seq = c(20)

for (support_max in support_seq){
   hawkes_markov_pref_price <- estimate_hawkes(
     event = event,
     state_func = state_func,
     delta = delta,
     support_max =support_max,
     mc.cores = cores,
     save.config=list(save_dir=save_dir_1, file_name=data_file_name, save_bigdt_only=save_bigdt_only))
 }



# event size must be considered as well ---------------------------------------------------------------

book[, event_size := size / predetermined_size_unit]
# # all regular orberbook events are additive: big event can be thought as sum of
# # same event with smaller size in short period of time
# # price chagne events are not additive: (-) and (t) will deplete the queue, and (+) needs to
# # draw from invariate distribution. In summary three types of event cannot be thought as sum of
# # same event with smaller size in short period of time
book[ref_price_jump_after_event != 0, event_size := 1]

event_new <- dcast(book[modifying_queue %in% -3:3],
                  time ~ queue_and_event, value.var = "event_size",
                  fun.aggregate = sum)
cols_all <- paste0(rep(sprintf("%+d", c(1:3, -(1:3))), each=3), "(", c("+", "-", "t"), ")")
cols_all <- c(cols_all, c("p+(+)", "p+(-)", "p+(t)", "p-(+)", "p-(-)", "p-(t)"))
for(i in setdiff(cols_all, colnames(event_new))){
 event_new[, eval(i) := 0]
}
event_new <- event_new[,mget(c("time", cols_all))]

save_dir_2 <- paste0(save_dir, "/data_processed_withsize/")

for (support_max in support_seq){
  hawkes_markov_pref_price_eventsize <- estimate_hawkes(
    event = event_new,
    state_func = state_func,
    delta = delta,
    support_max =support_max,
    mc.cores = cores,
    save.config=list(save_dir=save_dir_2, file_name=data_file_name, save_bigdt_only=save_bigdt_only))

}

  
# the model parameter lambda now interpretates as mean event size for next short period (0.5 seconds)
